In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
In [2]:
dt = pd.read_csv("ggd-664.csv")
In [3]:
dt.head(10)
Out[3]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
0 Wii Sports Wii 2006.0 Sports Nintendo 41.36 28.96 3.77 8.45 82.53 76.0 51.0 8 322.0 Nintendo E
1 Super Mario Bros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24 NaN NaN NaN NaN NaN NaN
2 Mario Kart Wii Wii 2008.0 Racing Nintendo 15.68 12.76 3.79 3.29 35.52 82.0 73.0 8.3 709.0 Nintendo E
3 Wii Sports Resort Wii 2009.0 Sports Nintendo 15.61 10.93 3.28 2.95 32.77 80.0 73.0 8 192.0 Nintendo E
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37 NaN NaN NaN NaN NaN NaN
5 Tetris GB 1989.0 Puzzle Nintendo 23.20 2.26 4.22 0.58 30.26 NaN NaN NaN NaN NaN NaN
6 New Super Mario Bros. DS 2006.0 Platform Nintendo 11.28 9.14 6.50 2.88 29.80 89.0 65.0 8.5 431.0 Nintendo E
7 Wii Play Wii 2006.0 Misc Nintendo 13.96 9.18 2.93 2.84 28.92 58.0 41.0 6.6 129.0 Nintendo E
8 New Super Mario Bros. Wii Wii 2009.0 Platform Nintendo 14.44 6.94 4.70 2.24 28.32 87.0 80.0 8.4 594.0 Nintendo E
9 Duck Hunt NES 1984.0 Shooter Nintendo 26.93 0.63 0.28 0.47 28.31 NaN NaN NaN NaN NaN NaN
In [4]:
dt.dtypes
Out[4]:
Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score          object
User_Count         float64
Developer           object
Rating              object
dtype: object
In [5]:
dt.isna().sum()
Out[5]:
Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64
In [6]:
#x=pd.DataFrame(dt[dt.Year_of_Release.isna()].groupby('Name')['Year_of_Release'].nunique())
In [7]:
#x
In [8]:
dt[dt.Name.isna()]
Out[8]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
659 NaN GEN 1993.0 NaN Acclaim Entertainment 1.78 0.53 0.00 0.08 2.39 NaN NaN NaN NaN NaN NaN
14246 NaN GEN 1993.0 NaN Acclaim Entertainment 0.00 0.00 0.03 0.00 0.03 NaN NaN NaN NaN NaN NaN
In [9]:
dt.dropna(subset=['Name'], inplace=True)
In [10]:
duplicates = dt[dt.duplicated(['Name', 'Platform'])]
duplicates
Out[10]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
1591 Need for Speed: Most Wanted X360 2005.0 Racing Electronic Arts 1.0 0.13 0.02 0.10 1.25 83.0 54.0 8.5 134.0 EA Canada T
4127 Sonic the Hedgehog PS3 NaN Platform NaN 0.0 0.48 0.00 0.00 0.48 43.0 17.0 4.1 176.0 Sonic Team E10+
11716 Need for Speed: Most Wanted PC 2012.0 Racing Electronic Arts 0.0 0.06 0.00 0.02 0.08 82.0 19.0 8.5 525.0 Black Box T
16233 Madden NFL 13 PS3 2012.0 Sports Electronic Arts 0.0 0.01 0.00 0.00 0.01 83.0 22.0 5.5 101.0 EA Tiburon E
In [11]:
dt.loc[dt['Name'] == 'Sonic the Hedgehog']
Out[11]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
257 Sonic the Hedgehog GEN 1991.0 Platform Sega 3.03 0.91 0.26 0.13 4.34 NaN NaN NaN NaN NaN NaN
1745 Sonic the Hedgehog PS3 2006.0 Platform Sega 0.41 0.06 0.04 0.66 1.16 43.0 17.0 4.1 176.0 Sonic Team E10+
1996 Sonic the Hedgehog X360 2006.0 Platform Sega 0.44 0.48 0.00 0.11 1.04 46.0 38.0 4.4 455.0 Sega E10+
4127 Sonic the Hedgehog PS3 NaN Platform NaN 0.00 0.48 0.00 0.00 0.48 43.0 17.0 4.1 176.0 Sonic Team E10+
In [12]:
def adding_sales(dt, row1, row2):
    for x in list(range(5,10)):
        dt.iloc[row1, x] = dt.iloc[row1, x] + dt.iloc[row2, x]
    updated_df = dt.drop(index=row2)
    return updated_df
In [13]:
dt = adding_sales(dt,1745, 4127) 
dt.query('Name == "Sonic the Hedgehog"')
Out[13]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
257 Sonic the Hedgehog GEN 1991.0 Platform Sega 3.03 0.91 0.26 0.13 4.34 NaN NaN NaN NaN NaN NaN
1745 Sonic the Hedgehog PS3 2006.0 Platform Sega 0.41 0.06 0.04 0.66 1.16 43.0 17.0 4.1 176.0 Sonic Team E10+
1996 Sonic the Hedgehog X360 2006.0 Platform Sega 0.44 0.48 0.00 0.11 1.04 46.0 38.0 4.4 455.0 Sega E10+
In [14]:
dt.loc[dt['Name'] == 'Madden NFL 13']
Out[14]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
507 Madden NFL 13 X360 2012.0 Sports Electronic Arts 2.53 0.15 0.0 0.17 2.86 81.0 36.0 5.8 179.0 EA Tiburon E
604 Madden NFL 13 PS3 2012.0 Sports Electronic Arts 2.11 0.22 0.0 0.23 2.56 83.0 22.0 5.5 101.0 EA Tiburon E
3986 Madden NFL 13 Wii 2012.0 Sports Electronic Arts 0.47 0.00 0.0 0.03 0.50 NaN NaN 7.3 4.0 EA Tiburon E
5887 Madden NFL 13 PSV 2012.0 Sports Electronic Arts 0.28 0.00 0.0 0.02 0.30 63.0 6.0 7.3 38.0 EA Tiburon E
7067 Madden NFL 13 WiiU 2012.0 Sports Electronic Arts 0.21 0.00 0.0 0.02 0.23 75.0 9.0 6.7 30.0 EA Tiburon E
16233 Madden NFL 13 PS3 2012.0 Sports Electronic Arts 0.00 0.01 0.0 0.00 0.01 83.0 22.0 5.5 101.0 EA Tiburon E
In [15]:
dt = adding_sales(dt,604, 16233) 
dt.query('Name == "Madden NFL 13"')
Out[15]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
507 Madden NFL 13 X360 2012.0 Sports Electronic Arts 2.53 0.15 0.00 0.17 2.86 81.0 36.0 5.8 179.0 EA Tiburon E
604 Madden NFL 13 PS3 2012.0 Sports Electronic Arts 2.11 0.22 0.01 0.23 2.57 83.0 22.0 5.5 101.0 EA Tiburon E
3986 Madden NFL 13 Wii 2012.0 Sports Electronic Arts 0.47 0.00 0.00 0.03 0.50 NaN NaN 7.3 4.0 EA Tiburon E
5887 Madden NFL 13 PSV 2012.0 Sports Electronic Arts 0.28 0.00 0.00 0.02 0.30 63.0 6.0 7.3 38.0 EA Tiburon E
7067 Madden NFL 13 WiiU 2012.0 Sports Electronic Arts 0.21 0.00 0.00 0.02 0.23 75.0 9.0 6.7 30.0 EA Tiburon E
In [16]:
dt.loc[dt['Name'] == 'Need for Speed: Most Wanted']
Out[16]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
253 Need for Speed: Most Wanted PS2 2005.0 Racing Electronic Arts 2.03 1.79 0.08 0.47 4.37 82.0 36.0 9.1 137.0 EA Canada T
523 Need for Speed: Most Wanted PS3 2012.0 Racing Electronic Arts 0.71 1.46 0.06 0.58 2.81 NaN NaN NaN NaN NaN NaN
1190 Need for Speed: Most Wanted X360 2012.0 Racing Electronic Arts 0.62 0.78 0.01 0.15 1.56 83.0 54.0 8.5 134.0 EA Canada T
1591 Need for Speed: Most Wanted X360 2005.0 Racing Electronic Arts 1.00 0.13 0.02 0.10 1.25 83.0 54.0 8.5 134.0 EA Canada T
1998 Need for Speed: Most Wanted XB 2005.0 Racing Electronic Arts 0.53 0.46 0.00 0.05 1.04 83.0 32.0 8.8 29.0 EA Canada T
2048 Need for Speed: Most Wanted PSV 2012.0 Racing Electronic Arts 0.33 0.45 0.01 0.22 1.01 NaN NaN NaN NaN NaN NaN
3581 Need for Speed: Most Wanted GC 2005.0 Racing Electronic Arts 0.43 0.11 0.00 0.02 0.56 80.0 18.0 9.1 22.0 EA Canada T
5973 Need for Speed: Most Wanted PC 2005.0 Racing Electronic Arts 0.02 0.23 0.00 0.04 0.29 82.0 19.0 8.5 525.0 Black Box T
6274 Need for Speed: Most Wanted WiiU 2013.0 Racing Electronic Arts 0.13 0.12 0.00 0.02 0.27 NaN NaN NaN NaN NaN NaN
6411 Need for Speed: Most Wanted DS 2005.0 Racing Electronic Arts 0.24 0.01 0.00 0.02 0.27 45.0 4.0 6.1 22.0 EA Canada E
6474 Need for Speed: Most Wanted GBA 2005.0 Racing Electronic Arts 0.19 0.07 0.00 0.00 0.26 NaN NaN 8.3 14.0 EA Canada E
11716 Need for Speed: Most Wanted PC 2012.0 Racing Electronic Arts 0.00 0.06 0.00 0.02 0.08 82.0 19.0 8.5 525.0 Black Box T
In [17]:
#Xbox 360 Platform
print(dt.loc[1190,'Name'])
print(dt.loc[1591,'Name'])

#PC Platform
print(dt.loc[5973,'Name'])
print(dt.loc[11716,'Name'])
Need for Speed: Most Wanted
Need for Speed: Most Wanted
Need for Speed: Most Wanted
Need for Speed: Most Wanted
In [18]:
dt.loc[1190,'Name'] = 'Need for Speed: Most Wanted 2005'
dt.loc[1591,'Name'] = 'Need for Speed: Most Wanted 2012'
dt.loc[5973,'Name'] = 'Need for Speed: Most Wanted 2005'
dt.loc[11716,'Name'] = 'Need for Speed: Most Wanted 2012'

dt.loc[[1190, 1591,5973,11716],'Name']
Out[18]:
1190     Need for Speed: Most Wanted 2005
1591     Need for Speed: Most Wanted 2012
5973     Need for Speed: Most Wanted 2005
11716    Need for Speed: Most Wanted 2012
Name: Name, dtype: object
In [ ]:
 
In [19]:
import pandas as pd
import numpy as np

def missing_score(dt):

    dt['User_Score'] = pd.to_numeric(dt['User_Score'], errors='coerce')
    
    new_df = pd.DataFrame(columns=dt.columns)
    
    # Group the DataFrame by game name
    grouped = dt.groupby('Name')
    
    # Loop through each group
    for name, group in grouped:
        # Get the mean scores for this game
        user_mean = group['User_Score'].mean()
        critic_mean = group['Critic_Score'].mean()
        user_mean1 = group['User_Count'].mean()
        critic_mean1 = group['Critic_Count'].mean()
        #print(f"Game: {name}, User Mean Score: {user_mean}, Critic Mean Score: {critic_mean}")
        
        # Replace missing user scores with the mean
        group.loc[group['User_Score'].isnull(), 'User_Score'] = user_mean
        
        # Replace missing critic scores with the mean
        group.loc[group['Critic_Score'].isnull(), 'Critic_Score'] = critic_mean
        
        group.loc[group['User_Count'].isnull(), 'User_Count'] = user_mean1
        
        # Replace missing critic scores with the mean
        group.loc[group['Critic_Count'].isnull(), 'Critic_Count'] = critic_mean1
        
        new_df = pd.concat([new_df, group])
        
        new_df = new_df.reset_index(drop=True)
    
    return new_df
In [20]:
dt = missing_score(dt)
In [21]:
dt.isna().sum()
Out[21]:
Name                  0
Platform              0
Year_of_Release     268
Genre                 0
Publisher            53
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       7619
Critic_Count       7619
User_Score         8028
User_Count         8028
Developer          6621
Rating             6767
dtype: int64
In [22]:
dt.dropna(subset=['Year_of_Release'], inplace=True)
dt.dropna(subset=['Publisher'], inplace=True)
dt.dropna(subset=['Critic_Score'], inplace=True)
dt.dropna(subset=['User_Score'], inplace=True)
dt.dropna(subset=['User_Count'], inplace=True)
dt.dropna(subset=['Critic_Count'], inplace=True)
dt.dropna(subset=['Developer'], inplace=True)
dt.dropna(subset=['Rating'], inplace=True)
In [23]:
dt.isna().sum()
Out[23]:
Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Other_Sales        0
Global_Sales       0
Critic_Score       0
Critic_Count       0
User_Score         0
User_Count         0
Developer          0
Rating             0
dtype: int64
In [24]:
dt['Year_of_Release'] = dt['Year_of_Release'].astype('int')
dt['User_Score'] = dt['User_Score'].astype('float')
dt['NA_Sales'] = dt['NA_Sales'].astype('float')
dt['EU_Sales'] = dt['EU_Sales'].astype('float')
dt['JP_Sales'] = dt['JP_Sales'].astype('float')
dt['Other_Sales'] = dt['Other_Sales'].astype('float')
dt['Global_Sales'] = dt['Global_Sales'].astype('float')
dt['Critic_Score'] = dt['Critic_Score'].astype('float')
dt['Critic_Count'] = dt['Critic_Count'].astype('int')
#dt[['Genre','Rating']] = dt[['Genre','Rating']].astype('category')
dt['User_Count'] = dt['User_Count'].astype('int')
dt.dtypes
Out[24]:
Name                object
Platform            object
Year_of_Release      int32
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count         int32
User_Score         float64
User_Count           int32
Developer           object
Rating              object
dtype: object
In [25]:
dt['User_Score'] = dt['User_Score']*10  #User score was from 1-10 and critic score was 1-100 so we normalized the scale.
In [70]:
df = dt

Model¶

In [ ]:
corr = dt.corr()

fig, ax = plt.subplots(figsize=(11,10))
sns.heatmap(corr, annot=True)

plt.show()
In [ ]:
dt.dtypes
In [26]:
features1 = dt[['Critic_Score','User_Score','NA_Sales']]
class1 = dt[['Global_Sales']]
In [ ]:
 
In [27]:
features_train, features_test, class_train, class_test = train_test_split(features1, class1, test_size=0.15, random_state=0)
In [29]:
class_test.index
Out[29]:
Int64Index([ 6650, 12766,  7006, 11993, 15908, 11195, 11562,   356,  4045,
             5766,
            ...
             4086,  1000,   615, 13894, 13604,  2937, 14063, 15608,  9378,
             4184],
           dtype='int64', length=1172)
In [30]:
names_test = dt.loc[class_test.index, 'Name'].values
In [28]:
features_train= features_train.to_numpy()
features_test= features_test.to_numpy()
class_train= class_train.to_numpy()
class_test= class_test.to_numpy()
In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
In [33]:
model = LinearRegression()
#model = SVR(kernel='linear')
model.fit(features_train, class_train.ravel())
y_pred = model.predict(features_test)
accuracy = r2_score(class_test, y_pred)
print(accuracy)
0.8805100296458697

VISUAL for RESEARCH Q1 showing predicted sales for 1 game using 3 models¶

In [41]:
#LR
model = LinearRegression()
model.fit(features_train, class_train.ravel())
y_pred = model.predict(features_test)
accuracy = r2_score(class_test, y_pred)
rmse = np.sqrt(mean_squared_error(class_test, y_pred))
mae = mean_absolute_error(class_test, y_pred)
print(accuracy)
print(rmse)
print(mae)

#SVR
model1 = SVR(kernel='linear')
model1.fit(features_train, class_train.ravel())
y_pred1 = model1.predict(features_test)
accuracy1 = r2_score(class_test, y_pred1)
rmse1 = np.sqrt(mean_squared_error(class_test, y_pred1))
mae1 = mean_absolute_error(class_test, y_pred1)
print(accuracy1)
print(rmse1)
print(mae1)

#RF
model2 = RandomForestRegressor(n_estimators=100)
model2.fit(features_train, class_train.ravel())
y_pred2 = model2.predict(features_test)
accuracy2 = r2_score(class_test, y_pred2)
rmse2 = np.sqrt(mean_squared_error(class_test, y_pred2))
mae2 = mean_absolute_error(class_test, y_pred2)
print(accuracy2)
print(rmse2)
print(mae2)

#DT
from sklearn.tree import DecisionTreeRegressor
# Initialize the model
model3 = DecisionTreeRegressor()
model3.fit(features_train, class_train.ravel())
y_pred3 = model3.predict(features_test)
accuracy3 = r2_score(class_test, y_pred3)
rmse3 = np.sqrt(mean_squared_error(class_test, y_pred3))
mae3 = mean_absolute_error(class_test, y_pred3)
print(accuracy3)
print(rmse3)
print(mae3)

lr = model
svr = model1
rf = model2
dtx=model3
0.8805100296458697
0.5952654121797615
0.22208546809051266
0.8663012071293303
0.6296637116330522
0.21095203617585212
0.867757407324701
0.626225285703206
0.23651090907551878
0.7377533739998539
0.881860472369909
0.318764220705347
In [ ]:
 
In [63]:
import seaborn as sns
import matplotlib.pyplot as plt

models = ['Linear Regression', 'SVR', 'Random Forest','Decision Tree']
r_squared_values = [accuracy, accuracy1, accuracy2, accuracy3]

plt.figure(figsize=(8, 6))
ax = sns.pointplot(x=models, y=r_squared_values)
ax.set(xlabel='Models', ylabel='R-squared', title='Comparison of Accuracy(R-square)')
plt.ylim(0.65, 1.0)
plt.show()
In [61]:
import seaborn as sns
import matplotlib.pyplot as plt

models = ['Linear Regression', 'SVR',  'Random Forest','Decision Tree']
r_squared_values = [rmse, rmse1, rmse2, rmse3]

plt.figure(figsize=(8, 6))
ax = sns.pointplot(x=models, y=r_squared_values)
ax.set(xlabel='Models', ylabel='RMSE', title='Comparison of Root Mean Square Error')
plt.ylim(0.5, 1.0)
plt.show()
In [58]:
import seaborn as sns
import matplotlib.pyplot as plt

models = ['Linear Regression', 'SVR',  'Random Forest','Decision Tree']
r_squared_values = [mae, mae1, mae2, mae3]

plt.figure(figsize=(8, 6))
ax = sns.pointplot(x=models, y=r_squared_values)
ax.set(xlabel='Models', ylabel='RMSE', title='Comparison of Mean Absolute Error')
plt.ylim(0.15, 0.5)
plt.show()
In [34]:
## RUN SVR ONLY IF NECESSARY (TAKES TOO LONG TO EXECUTE)


game_index = np.random.choice(range(len(features_test)))
game_features = features_test[game_index].reshape(1, -1)
game_actual_sales = class_test[game_index]

# Predict the global sales of the game using the three models
game_lr_predicted_sales = lr.predict(game_features)
game_svr_predicted_sales = svr.predict(game_features)
game_rf_predicted_sales = rf.predict(game_features)
game_dt_predicted_sales = dtx.predict(game_features)

plt.figure(figsize=(8,6))
# Plot the predicted global sales vs. actual global sales for the game
plt.scatter(game_actual_sales, game_lr_predicted_sales, label='Linear Regression')
plt.scatter(game_actual_sales, game_svr_predicted_sales, label='Support Vector Regression')
plt.scatter(game_actual_sales, game_rf_predicted_sales, label='Random Forest Regression')
plt.scatter(game_actual_sales, game_dt_predicted_sales, label='Decision Tree Regression')
plt.xlabel('Actual Global Sales')
plt.ylabel('Predicted Global Sales')
plt.title('Comparison of Regression Models for ' + names_test[game_index])
plt.legend()
plt.show()
In [259]:
game_actual_sales
Out[259]:
array([0.15])
In [260]:
game_lr_predicted_sales
Out[260]:
array([0.17183536])
In [261]:
game_svr_predicted_sales
Out[261]:
array([0.18291265])
In [262]:
game_dt_predicted_sales
Out[262]:
array([0.09])
In [263]:
game_rf_predicted_sales
Out[263]:
array([0.13])

MODELLING Q2¶

In [64]:
from sklearn.preprocessing import LabelEncoder
In [ ]:
#TEST CODE#
In [65]:
# Encode the Genre and Publisher columns using LabelEncoder
le_g = LabelEncoder()
le_p = LabelEncoder()

dt['Genre'] = le_g.fit_transform(dt['Genre'])
dt['Publisher'] = le_p.fit_transform(dt['Publisher'])

# Split the dataset into training and testing sets
X = dt[['Year_of_Release','Genre','Publisher','NA_Sales']]
y = dt['Global_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

# Train a linear regression model on the training set
model = LinearRegression()
model.fit(X_train, y_train)

#model = RandomForestRegressor(n_estimators=50, random_state=0)
#model.fit(X_train, y_train.ravel())
#y_pred1 = model1.predict(features_test)


# Evaluate the model on the testing set
y_pred = model.predict(X_test)
In [66]:
dt['Genre'] = le_g.inverse_transform(dt['Genre'].astype(int))
dt['Publisher'] = le_p.inverse_transform(dt['Publisher'].astype(int))
In [67]:
dt
Out[67]:
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
5 Tales of Xillia 2 PS3 2012 Role-Playing Namco Bandai Games 0.20 0.12 0.45 0.07 0.84 71.0 59 79.0 216 Bandai Namco Games T
11 .hack//Infection Part 1 PS2 2002 Role-Playing Atari 0.49 0.38 0.26 0.13 1.27 75.0 35 85.0 60 CyberConnect2 T
13 .hack//Mutation Part 2 PS2 2002 Role-Playing Atari 0.23 0.18 0.20 0.06 0.68 76.0 24 89.0 81 CyberConnect2 T
14 .hack//Outbreak Part 3 PS2 2002 Role-Playing Atari 0.14 0.11 0.17 0.04 0.46 70.0 23 87.0 19 CyberConnect2 T
17 007 Racing PS 2000 Racing Electronic Arts 0.30 0.20 0.00 0.03 0.53 51.0 16 46.0 14 Eutechnyx T
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16702 pro evolution soccer 2011 X360 2010 Sports Konami Digital Entertainment 0.09 0.44 0.00 0.07 0.61 79.0 43 59.0 33 Konami E
16703 pro evolution soccer 2011 PS2 2010 Sports Konami Digital Entertainment 0.04 0.21 0.05 0.11 0.41 77.4 22 67.0 7 Konami E
16704 pro evolution soccer 2011 Wii 2010 Sports Konami Digital Entertainment 0.07 0.10 0.03 0.02 0.22 78.0 9 54.0 7 Konami E
16711 uDraw Studio: Instant Artist Wii 2011 Misc THQ 0.06 0.09 0.00 0.02 0.17 54.0 5 57.0 6 THQ E
16712 uDraw Studio: Instant Artist X360 2011 Misc THQ 0.01 0.01 0.00 0.00 0.02 54.0 5 57.0 6 THQ E

7810 rows × 16 columns

In [68]:
# Get the top publisher for each genre based on NA_Sales and Global_Sales
top_publishers = dt.groupby('Genre').agg({
    'Publisher': lambda x: x.value_counts().index[0],
    'NA_Sales': 'mean',
    'Global_Sales': 'mean'
}).reset_index()
In [69]:
top_publishers
Out[69]:
Genre Publisher NA_Sales Global_Sales
0 Action Activision 0.338090 0.682804
1 Adventure Ubisoft 0.149965 0.311150
2 Fighting Namco Bandai Games 0.338454 0.620749
3 Misc Ubisoft 0.495267 0.922305
4 Platform THQ 0.424417 0.820961
5 Puzzle Nintendo 0.274385 0.636462
6 Racing Electronic Arts 0.349468 0.729911
7 Role-Playing Square Enix 0.303803 0.696430
8 Shooter Electronic Arts 0.500686 0.912144
9 Simulation Electronic Arts 0.305210 0.670744
10 Sports Electronic Arts 0.419583 0.768707
11 Strategy THQ 0.121828 0.254122
In [70]:
# Create new rows for each genre-publisher combination with Year as 2017
new_games = []
for _, row in top_publishers.iterrows():
    new_game = {'Year_of_Release': 2017, 'Genre': row['Genre'], 'Publisher': row['Publisher'], 'NA_Sales': row['NA_Sales'], 'Global_Sales': row['Global_Sales']}
    new_games.append(new_game)
new_games = pd.DataFrame(new_games)
In [71]:
new_games
Out[71]:
Year_of_Release Genre Publisher NA_Sales Global_Sales
0 2017 Action Activision 0.338090 0.682804
1 2017 Adventure Ubisoft 0.149965 0.311150
2 2017 Fighting Namco Bandai Games 0.338454 0.620749
3 2017 Misc Ubisoft 0.495267 0.922305
4 2017 Platform THQ 0.424417 0.820961
5 2017 Puzzle Nintendo 0.274385 0.636462
6 2017 Racing Electronic Arts 0.349468 0.729911
7 2017 Role-Playing Square Enix 0.303803 0.696430
8 2017 Shooter Electronic Arts 0.500686 0.912144
9 2017 Simulation Electronic Arts 0.305210 0.670744
10 2017 Sports Electronic Arts 0.419583 0.768707
11 2017 Strategy THQ 0.121828 0.254122
In [72]:
# Fit the label encoder on the entire dataset
genre_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()
genre_encoder.fit(new_games['Genre'])
publisher_encoder.fit(new_games['Publisher'])
Out[72]:
LabelEncoder()
In [73]:
# Encode the genre and publisher columns in the new_games dataset
new_games['Genre'] = genre_encoder.transform(new_games['Genre'])
new_games['Publisher'] = publisher_encoder.transform(new_games['Publisher'])
In [74]:
# Select the features to use for prediction
features = ['Year_of_Release', 'Genre', 'Publisher', 'NA_Sales']

# Use the trained model to predict the expected global sales for each new game
expected_sales = model.predict(new_games[features])

# Add the predicted sales to the new games dataframe
new_games['Expected_Global_Sales'] = expected_sales
In [75]:
new_games
Out[75]:
Year_of_Release Genre Publisher NA_Sales Global_Sales Expected_Global_Sales
0 2017 0 0 0.338090 0.682804 0.728906
1 2017 1 6 0.149965 0.311150 0.365073
2 2017 2 2 0.338454 0.620749 0.728611
3 2017 3 6 0.495267 0.922305 1.032494
4 2017 4 5 0.424417 0.820961 0.894333
5 2017 5 3 0.274385 0.636462 0.602583
6 2017 6 1 0.349468 0.729911 0.746881
7 2017 7 4 0.303803 0.696430 0.658353
8 2017 8 1 0.500686 0.912144 1.038362
9 2017 9 1 0.305210 0.670744 0.659016
10 2017 10 1 0.419583 0.768707 0.879842
11 2017 11 5 0.121828 0.254122 0.303232
In [76]:
# Transform the Genre and Publisher columns using the same LabelEncoder objects
new_games['Genre'] = genre_encoder.inverse_transform(new_games['Genre'].astype(int))
new_games['Publisher'] = publisher_encoder.inverse_transform(new_games['Publisher'].astype(int))

# Show the final new_games dataframe
print(new_games)
    Year_of_Release         Genre           Publisher  NA_Sales  Global_Sales  \
0              2017        Action          Activision  0.338090      0.682804   
1              2017     Adventure             Ubisoft  0.149965      0.311150   
2              2017      Fighting  Namco Bandai Games  0.338454      0.620749   
3              2017          Misc             Ubisoft  0.495267      0.922305   
4              2017      Platform                 THQ  0.424417      0.820961   
5              2017        Puzzle            Nintendo  0.274385      0.636462   
6              2017        Racing     Electronic Arts  0.349468      0.729911   
7              2017  Role-Playing         Square Enix  0.303803      0.696430   
8              2017       Shooter     Electronic Arts  0.500686      0.912144   
9              2017    Simulation     Electronic Arts  0.305210      0.670744   
10             2017        Sports     Electronic Arts  0.419583      0.768707   
11             2017      Strategy                 THQ  0.121828      0.254122   

    Expected_Global_Sales  
0                0.728906  
1                0.365073  
2                0.728611  
3                1.032494  
4                0.894333  
5                0.602583  
6                0.746881  
7                0.658353  
8                1.038362  
9                0.659016  
10               0.879842  
11               0.303232  
In [77]:
new_games
Out[77]:
Year_of_Release Genre Publisher NA_Sales Global_Sales Expected_Global_Sales
0 2017 Action Activision 0.338090 0.682804 0.728906
1 2017 Adventure Ubisoft 0.149965 0.311150 0.365073
2 2017 Fighting Namco Bandai Games 0.338454 0.620749 0.728611
3 2017 Misc Ubisoft 0.495267 0.922305 1.032494
4 2017 Platform THQ 0.424417 0.820961 0.894333
5 2017 Puzzle Nintendo 0.274385 0.636462 0.602583
6 2017 Racing Electronic Arts 0.349468 0.729911 0.746881
7 2017 Role-Playing Square Enix 0.303803 0.696430 0.658353
8 2017 Shooter Electronic Arts 0.500686 0.912144 1.038362
9 2017 Simulation Electronic Arts 0.305210 0.670744 0.659016
10 2017 Sports Electronic Arts 0.419583 0.768707 0.879842
11 2017 Strategy THQ 0.121828 0.254122 0.303232
In [78]:
import plotly.express as px

plt.figure(figsize=(16,9))
# assuming your dataframe is called 'df'
fig = px.bar(new_games, x='Genre', y='Global_Sales', color='Publisher', hover_data=['Publisher'])
# update hover information to only display publisher
#fig.update_traces(hovertemplate='<br>'.join(['Publisher: %{customdata}']))


fig.show()
<Figure size 1600x900 with 0 Axes>
In [124]:
### MINING QUESTION HOW WILL THE GAME SALES BE IN YEAR 2017 BASED ON GENRE, (DATA IS UPTO YEAR 2016)
In [112]:
#sns.set(style="darkgrid")
df = new_games

# Set the width of each bar and the positions of the bars on the x-axis
bar_width = 0.35
x_pos = np.arange(len(df['Genre']))

# Create a figure and axis objects with a larger size
fig, ax = plt.subplots(figsize=(16, 9))

# Create bars for global sales and expected global sales for each publisher
global_sales_bars = ax.bar(x_pos - bar_width/2, df['Global_Sales'], bar_width, label='Global Sales')
expected_global_sales_bars = ax.bar(x_pos + bar_width/2, df['Expected_Global_Sales'], bar_width, label='Expected Global Sales')

# Add labels, title, and legend
ax.set_ylabel('Sales')
#ax.set_title('Global Sales by Publisher')
ax.set_xticks(x_pos)
ax.set_xticklabels(df['Genre'])
ax.legend()

# Show the plot
plt.show()
In [80]:
import plotly.graph_objects as go
df = new_games

colors = [sns.color_palette()[0], sns.color_palette()[1]]
palette = sns.color_palette(['#0072B2', '#D55E00']).as_hex()

# Create a figure with the bar chart
fig = px.bar(df, x='Genre', y=['Global_Sales', 'Expected_Global_Sales'], barmode= 'group',
             hover_data=['Publisher'],color_discrete_sequence=palette)

#fig.update_traces(marker=['blue', 'orange'])

# Customize the hover text to show the top publisher and sales type
fig.update_traces(hovertemplate='Publisher: %{customdata[0]}<br>'
                                  )

# Set the figure layout and title
fig.update_layout(
    title='Global Sales by Genre',
    xaxis_title='Genre',
    yaxis_title='Sales'
)

# Show the plot
fig.show()
In [126]:
##Q 2
from sklearn.svm import SVR
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
In [127]:
from sklearn.svm import SVR
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = dt[['Year_of_Release','Critic_Score','User_Score','NA_Sales']]
y = dt[['Global_Sales']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
X_train= X_train.to_numpy()
X_test= X_test.to_numpy()
y_train= y_train.to_numpy()
y_test= y_test.to_numpy()


model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



#model = SVR(kernel='linear')

# Train the model on the training set
#model.fit(X_train, y_train)

# Make predictions on the testing set
#y_pred = model.predict(X_test)

# Evaluate the performance of the model
#mse = mean_squared_error(y_test, y_pred)
#print("Mean squared error: ", mse)

from sklearn.ensemble import RandomForestRegressor


#model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training set
#model.fit(X_train, y_train)

# Make predictions on the testing set
#y_pred = model.predict(X_test)


accuracy = r2_score(y_test, y_pred)
print(accuracy)
0.8811877796971053
In [128]:
Test_Data = pd.DataFrame(X_test)
In [174]:
Test_Data[0] = Test_Data[0].astype(int)
Test_Data
Out[174]:
0 1 2 3
0 2013 71.0 70.0 0.30
1 2001 89.0 86.0 0.00
2 2002 81.0 89.0 1.22
3 2008 63.0 79.0 0.09
4 2006 55.0 74.0 0.06
... ... ... ... ...
1167 2013 60.0 72.0 0.38
1168 2004 75.0 88.0 0.08
1169 2008 74.0 68.0 0.22
1170 2008 67.0 84.0 0.11
1171 2015 65.0 63.0 0.09

1172 rows × 4 columns

In [187]:
dff = Test_Data.sample(n=5)#,replace=False)
dff
Out[187]:
0 1 2 3
497 2008 47.0 70.0 0.49
726 2003 76.0 74.0 0.01
416 2012 88.0 83.0 0.48
140 2011 56.0 57.0 0.02
10 2010 63.0 72.0 0.23
In [ ]:
 

import random import pandas as pd

Define the original Critic scores and NA Sales¶

critic_scores = dff[0] na_sales = dff[2]

Select 5 user scores from xtest¶

original_user_scores = dff[1]

Create 5 sets of 5 random user scores¶

random_user_scores_sets = [] for i in range(5): random_user_scores = random.sample(range(101), 5) random_user_scores_set = [random_userscores for in range(5)] random_user_scores_sets.append(random_user_scores_set)

user_scores_sets = [list(critic_scores) + random_user_scores_set[i] + list(na_sales) for i in range(5)]

Concatenate original and random user scores for each set¶

df_list = [] for i in range(5): random_user_scores_set = random_user_scores_sets[i] user_scores = list(original_user_scores) + random_user_scores_set df = pd.DataFrame({'Critic score': critic_scores5, 'User score': user_scores, 'NA Sales': na_sales5}) df_list.append(df)

import pandas as pd import random

Read in the original dataframe with Critic score, User score, and NA Sales¶

df_test = dff df_test.columns = ['Critic score', 'User score', 'NA Sales']

Define the original Critic scores and NA Sales¶

critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales']

Create a list of 5 random user score sets¶

random_user_scores_sets = [random.sample(range(101), 5)] * len(df_test['User score'].unique())

Flatten the user score sets into a single list¶

user_scores = [] for score in df_test['User score'].unique(): user_scores += [score] + random_user_scores_sets

Repeat the original Critic scores and NA Sales for each set of random user scores¶

critic_scores = list(critic_scores) len(df_test['User score'].unique()) na_sales = list(na_sales) len(df_test['User score'].unique())

Create a dataframe with the correct length for each column¶

df = pd.DataFrame({'Critic score': critic_scores 5, 'User score': user_scores 5, 'NA Sales': na_sales * 5})

import pandas as pd

Sample dataframe with year_of_release column¶

df =dff df.columns = ['Year_of_Release','Critic score', 'User score', 'NA Sales']

critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales'] year = df_test['Year_of_Release'] user = df_test['User score']

Define a function to generate 5 consecutive year values for a given year¶

def generate_consecutive_years(year): return list(range(year + 1, year + 6))

Create a list of 5 consecutive year sets for each unique year in the dataframe¶

consecutive_years_sets = [] for year in df['Year_of_Release']: consecutive_years_sets += [generate_consecutive_years(year)] * 5

Flatten the consecutive year sets into a single list¶

consecutive_years = [] for year in df['Year_of_Release'].unique(): consecutive_years += [year] * 5 + consecutive_years_sets.pop(0)

critic_scores = np.repeat(df_test['Critic score'], 6).tolist() na_sales = np.repeat(df_test['NA Sales'], 6).tolist() user = np.repeat(df_test['User score'],6).tolist()

Add the consecutive year values to the dataframe¶

df['Consecutive Years'] = consecutive_years

NEW TEST for Year¶

import pandas as pd import random

Read in the original dataframe with Critic score, User score, and NA Sales¶

df_test = dff df_test.columns = ['Year_of_Release','Critic score', 'User score', 'NA Sales']

Define the original Critic scores and NA Sales¶

critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales'] year = df_test['Year_of_Release'] user = df_test['User score'] value_set = [2016,2017,2018,2019,2020] random_user_scores_sets = [valueset[:] for in range(5)]

Create a list of 5 random user score sets¶

random_user_scores_sets = [random.sample(range(101), 5)] * 5¶

Flatten the user score sets into a single list¶

user_scores = [] for score in df_test['Year_of_Release']: user_scores += [score] user_scores += random_user_scores_sets.pop(0)

Repeat the original Critic scores and NA Sales for each set of random user scores¶

critic_scores = list(critic_scores) * 6¶

na_sales = list(na_sales) * 6¶

Repeat the critic score and NA sales 5 times each¶

critic_scores = np.repeat(df_test['Critic score'], 6).tolist() na_sales = np.repeat(df_test['NA Sales'], 6).tolist() user = np.repeat(df_test['User score'],6).tolist()

Create a dataframe with the correct length for each column¶

dfx = pd.DataFrame({'Year_of_Release': user_scores, 'Critic score': critic_scores, 'User score': user, 'NA Sales': na_sales})

In [ ]:
#NEW TEST FOR CONSECUtive years
In [131]:
#dff[0] = dff[0].astype(int)
In [188]:
import pandas as pd
import numpy as np

# Read in the original dataframe with Critic score, User score, and NA Sales
df_test = dff
df_test.columns = ['Year_of_Release','Critic score', 'User score', 'NA Sales']

# Define the original Critic scores and NA Sales
critic_scores = df_test['Critic score']
na_sales = df_test['NA Sales']
user = df_test['User score']


# Create a list of 5 consecutive years for each unique value of Year_of_Release
year_sets = []
for year in df_test['Year_of_Release']:
    year_set = list(range(year, year+6))
    year_sets += [year_set[1:]]

# Flatten the year sets into a single list
year_of_release = []
for year in df_test['Year_of_Release']:
    year_of_release += [year]
    year_of_release += year_sets.pop(0)

# Repeat the original Critic scores and NA Sales for each set of random user scores
#critic_scores = list(critic_scores) * 6
#na_sales = list(na_sales) * 6

# Repeat the critic score and NA sales 5 times each
critic_scores = np.repeat(df_test['Critic score'], 6).tolist()
na_sales = np.repeat(df_test['NA Sales'], 6).tolist()
user = np.repeat(df_test['User score'],6).tolist()

# Create a dataframe with the correct length for each column
dfx = pd.DataFrame({'Year_of_Release': year_of_release,
                    'Critic score': critic_scores,
                    'User score': user,
                    'NA Sales': na_sales})
In [189]:
dfx
Out[189]:
Year_of_Release Critic score User score NA Sales
0 2008 47.0 70.0 0.49
1 2009 47.0 70.0 0.49
2 2010 47.0 70.0 0.49
3 2011 47.0 70.0 0.49
4 2012 47.0 70.0 0.49
5 2013 47.0 70.0 0.49
6 2003 76.0 74.0 0.01
7 2004 76.0 74.0 0.01
8 2005 76.0 74.0 0.01
9 2006 76.0 74.0 0.01
10 2007 76.0 74.0 0.01
11 2008 76.0 74.0 0.01
12 2012 88.0 83.0 0.48
13 2013 88.0 83.0 0.48
14 2014 88.0 83.0 0.48
15 2015 88.0 83.0 0.48
16 2016 88.0 83.0 0.48
17 2017 88.0 83.0 0.48
18 2011 56.0 57.0 0.02
19 2012 56.0 57.0 0.02
20 2013 56.0 57.0 0.02
21 2014 56.0 57.0 0.02
22 2015 56.0 57.0 0.02
23 2016 56.0 57.0 0.02
24 2010 63.0 72.0 0.23
25 2011 63.0 72.0 0.23
26 2012 63.0 72.0 0.23
27 2013 63.0 72.0 0.23
28 2014 63.0 72.0 0.23
29 2015 63.0 72.0 0.23

import pandas as pd import random

Read in the original dataframe with Critic score, User score, and NA Sales¶

df_test = dfx

df_test.columns = ['Critic score', 'User score', 'NA Sales']¶

Define the original Critic scores and NA Sales¶

critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales']

value_set = [0,20,40,60,80] random_user_scores_sets = [valueset[:] for in range(5)]

Create a list of 5 random user score sets¶

random_user_scores_sets = [random.sample(range(101), 5)] * 5¶

Flatten the user score sets into a single list¶

user_scores = [] for score in df_test['User score']: user_scores += [score] user_scores += random_user_scores_sets.pop(0)

Repeat the original Critic scores and NA Sales for each set of random user scores¶

critic_scores = list(critic_scores) * 6¶

na_sales = list(na_sales) * 6¶

Repeat the critic score and NA sales 5 times each¶

critic_scores = np.repeat(df_test['Critic score'], 6).tolist() na_sales = np.repeat(df_test['NA Sales'], 6).tolist()

Create a dataframe with the correct length for each column¶

dfx = pd.DataFrame({'Critic score': critic_scores, 'User score': user_scores, 'NA Sales': na_sales})

for test_set in df:

# Extract the features (Critic score, User score, NA Sales) from the test set
X_test = test_set.iloc[:, [0, 1, 2]].values

# Make predictions using the trained model
y_pred = reg_model.predict(X_test)

# Print the predicted values for the test set
print(y_pred)
In [190]:
y_pred = model.predict(dfx)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but LinearRegression was fitted without feature names
  warnings.warn(
In [191]:
y_pred_df = pd.DataFrame(y_pred)
In [192]:
dfx['PGS'] = y_pred_df
In [193]:
dfx['Year_of_Release'] = dfx['Year_of_Release'].astype('int')
In [196]:
import matplotlib.pyplot as plt
import seaborn as sns

# Split the original dataframe into 5 dataframes
split_df = np.array_split(dfx, 5)

# Convert the year column to integer
for df in split_df:
    df['Year_of_Release'] = df['Year_of_Release'].astype(int)

# Plot the predicted global sales for each of the 5 dataframes
sns.set(style='white')
fig, ax = plt.subplots(figsize=(8, 6))
for i in range(5):
    ax.plot(split_df[i]['Year_of_Release'], split_df[i]['PGS'], label=f'Test Set {i+1}')
ax.legend()
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
plt.show()
In [197]:
split_df = np.array_split(dfx, 5)
In [198]:
ts1=split_df[0]
ts2=split_df[1]
ts3=split_df[2]
ts4=split_df[3]
ts5=split_df[4]
In [199]:
dfs = [ts1, ts2, ts3, ts4, ts5]
In [167]:
dfs
Out[167]:
[   Year_of_Release  Critic score  User score  NA Sales       PGS
 0             2016     80.333333        72.0      0.01  0.140353
 1             2017     80.333333        72.0      0.01  0.150622
 2             2018     80.333333        72.0      0.01  0.160890
 3             2019     80.333333        72.0      0.01  0.171158
 4             2020     80.333333        72.0      0.01  0.181427
 5             2021     80.333333        72.0      0.01  0.191695,
     Year_of_Release  Critic score  User score  NA Sales       PGS
 6              2004          72.0        84.0       0.4  0.759638
 7              2005          72.0        84.0       0.4  0.769906
 8              2006          72.0        84.0       0.4  0.780175
 9              2007          72.0        84.0       0.4  0.790443
 10             2008          72.0        84.0       0.4  0.800711
 11             2009          72.0        84.0       0.4  0.810980,
     Year_of_Release  Critic score  User score  NA Sales       PGS
 12             2003          81.0        81.0       0.3  0.573045
 13             2004          81.0        81.0       0.3  0.583314
 14             2005          81.0        81.0       0.3  0.593582
 15             2006          81.0        81.0       0.3  0.603850
 16             2007          81.0        81.0       0.3  0.614119
 17             2008          81.0        81.0       0.3  0.624387,
     Year_of_Release  Critic score  User score  NA Sales       PGS
 18             2009          43.0        71.0      0.27  0.493269
 19             2010          43.0        71.0      0.27  0.503537
 20             2011          43.0        71.0      0.27  0.513806
 21             2012          43.0        71.0      0.27  0.524074
 22             2013          43.0        71.0      0.27  0.534342
 23             2014          43.0        71.0      0.27  0.544611,
     Year_of_Release  Critic score  User score  NA Sales       PGS
 24             2008          70.0        63.0      0.07  0.147552
 25             2009          70.0        63.0      0.07  0.157820
 26             2010          70.0        63.0      0.07  0.168088
 27             2011          70.0        63.0      0.07  0.178357
 28             2012          70.0        63.0      0.07  0.188625
 29             2013          70.0        63.0      0.07  0.198893]
In [202]:
import matplotlib.pyplot as plt
import seaborn as sns

# plot the predicted global sales for dfs[0]
sns.set(style='white')
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dfs[0]['Year_of_Release'], dfs[0]['PGS'], label=f'Test Set 1')
ax.legend()
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
plt.show()
In [215]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style='whitegrid')

# plot the data for ts1 using sns.pointplot
fig, ax = plt.subplots(figsize=(8, 6))
sns.pointplot(x='Year_of_Release', y='PGS', data=ts1, ax=ax)
# set the axis labels and title
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
ax.set_title('Predicted Global Sales for Test Set 1')

# add the legend


plt.show()
In [216]:
#DUP

import matplotlib.pyplot as plt

# create a list of 5 DataFrames

sns.set(style='white')
# plot the predicted global sales for each of the 6 user sales in each DataFrame
#plt.figure(figsize=(12,6))
fig, ax = plt.subplots(figsize=(12, 6))
#fig, ax = plt.subplots(figsize=(16, 9))
for i in range(5):
    ax.plot(dfs[i]['Year_of_Release'], dfs[i]['PGS'], label=f'Test Set {i+1}')
ax.legend()
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
plt.show()